library(ggplot2)
library(googlesheets4)
library(googledrive)
library(tidyverse)
library(summarytools)
library(car)
library(Hmisc)
library(readxl)
library(ggfortify)
library(flowCore)
library(lubridate)
library(reshape2)
library(pheatmap)
library(RColorBrewer)
library(ggwordcloud)
library(tm)
library(factoextra)
library(uwot)
library(igraph)
mean_sd = function(x){
return(mean_sdl(x, mult = 1))
}
set.seed(12345)
drive_auth(use_oob = T, cache = T, email = "rogangrant2022@u.northwestern.edu") # have to run in console :(
gs4_auth(token = drive_token(), use_oob = T, cache = T, email = "rogangrant2022@u.northwestern.edu")
data = read_sheet("https://docs.google.com/spreadsheets/d/1KHgJ-ZXQAfgwp-X1U2xjOiYEa--YrR6cGV3U20TxTXo/edit?usp=sharing",
skip = 0,
trim_ws = T,
.name_repair = "universal",
sheet = "2020-6-14_update")
## Reading from "COVID19 BAL stats"
## Range "'2020-6-14_update'"
#remove in-progress entries
data = subset(data, !is.na(Sample))
#mark neutrophilic patients
data$neutrophilic = data$percent_neutrophils > 40
#for subsetting later
observations = table(data$ID)
serial_patients = names(observations[observations > 1])
#adjust types
data$ID = as.character(data$ID)
rna_metadata = read_sheet("https://docs.google.com/spreadsheets/d/15aq9UMRtTl5E75E5M0mXvbFfxVVuJxrXneKYpo6LzlE/edit#gid=897409272",
trim_ws = T,
.name_repair = "universal",
sheet = "SCRIPT_01 RNAseq Metadata",
na = c("", "NA"))
## Reading from "SCRIPT_01_RNAseq"
## Range "'SCRIPT_01 RNAseq Metadata'"
## New names:
## * `` -> ...17
## * `` -> ...19
## * `Kit lot number` -> Kit.lot.number
## * `Elution vol (ul)` -> Elution.vol..ul.
## * `Date of RNA QC` -> Date.of.RNA.QC
## * ...
#clean up colnames
colnames(rna_metadata) = gsub("\\.", "_", colnames(rna_metadata)) #I like underscores
colnames(rna_metadata) = gsub("_+$", "", colnames(rna_metadata)) # remove trailing
colnames(rna_metadata) = gsub("_+", "_", colnames(rna_metadata)) #remove dup underscores
#tube is randomly read in as a list. fix this.
rna_metadata$tubeid_macrophRNA = as.character(rna_metadata$tubeid_macrophRNA)
rna_metadata$tubeid_macrophRNA[rna_metadata$tubeid_macrophRNA == "NULL"] = NA
#take out problem cols (added back)
rna_metadata = rna_metadata %>%
dplyr::select(-c(RNAseq_batch, Batch_sample))
#remove preceding zeros
rna_metadata = rna_metadata %>%
mutate_at("tubeid_macrophRNA", function(x){ gsub("^0", "", x) })
#fix dates
rna_metadata$sample_process_dt = as.Date(rna_metadata$sample_process_dt)
#merge with flow data
data$BAL_sample = substring(data$Sample, 10, 20)
data = left_join(data,
rna_metadata,
by = c("BAL_sample" = "tc_pt_study_id"))
# simple_md = read_excel(path = "~/Box/COVID19_BAL_flow/01_data/02_clinical_metadata/extracted_clinical_data/LMN_extracted_clinical_data_update052020.xlsx",
# sheet = "New list",
# .name_repair = "universal")
# simple_md$COVID.status = factor(simple_md$COVID.status)
# simple_md$outcome = factor(ifelse(grepl("deceased", simple_md$Discharged..d.c..or.deceased),
# yes = "deceased",
# no = "discharged"))
# simple_md$Study.ID = factor(as.character(simple_md$Study.ID))
#
# source("~/Documents/GitHub/COVID19_BAL_flow/rgrant/read_clinical_metadata_covid19.R")
# md = read_clinical_metadata_covid19()
# md = subset(md, grepl("\\d{4}-BAL-\\d{2}", Sample..)) #collected samples follow this format
#
edw_endpoints = read.csv("~/Box/RGrant/SCRIPT/200608 SCRIPT Basic Endpoints.csv",
strip.white = T,
check.names = T,
na.strings = c("", "NA"))
date_cols = colnames(edw_endpoints)[grepl("date", colnames(edw_endpoints), ignore.case = T) |
grepl("\\_dt", colnames(edw_endpoints))]
edw_endpoints = edw_endpoints %>%
mutate_at(.vars = date_cols,
.funs = function(x){
as.Date(x, format = "%m/%d/%y %H:%M", tz = "CST") })
edw_endpoints$pt_study_id = as.character(edw_endpoints$pt_study_id)
#simplify outcome data
edw_endpoints$binned_outcome = factor(vapply(edw_endpoints$discharge_disposition_name,
function(x)
{
if(x == "Expired")
{
return("Deceased")
} else if(grepl("^Home", x))
{
return("Discharged")
} else if(x %in% c("Acute Care Hospital", "Group Home", "Inpatient Hospice",
"Planned Readmission – DC/transferred to acute inpatient rehab",
"Acute Inpatient Rehabilitation", "Long-Term Acute Care Hospital (LTAC)",
"Skilled Nursing Facility or Subacute Rehab Care"))
{
return("Inpatient Facility")
} else if(is.na(x) | x == "unknown")
{
return(as.character(NA))
} else
{
return("Other")
}}, FUN.VALUE = "char"))
edw_endpoints = edw_endpoints %>%
select(-full_name)
# edw_micro = read_excel("~/Box/RGrant/SCRIPT/200526 SCRIPT Microbiology BAL Results.xlsx",
# skip = 7,
# .name_repair = "universal")
# keep_cols = apply(edw_micro, 2, function(x){ !all(is.na(x)) })
# edw_micro = edw_micro[, keep_cols]
# edw_micro$order.datetime = as.Date(edw_micro$order.datetime, origin = "1899-12-30") #excel date format
# edw_micro = subset(edw_micro, !is.na(order.datetime))
# edw_micro$infection_detected = factor(!is.na(edw_micro$organism.name))
# edw_micro$organism.quantity[grepl(">", edw_micro$organism.quantity)] = "10000"
# edw_micro$organism.quantity = as.numeric(edw_micro$organism.quantity)
# #summarize for easy binding, get rid of garbage info
# edw_micro = edw_micro %>%
# group_by(pt.study.id, order.datetime) %>%
# dplyr::summarize(detected_organisms = list(organism.name),
# organism_quantities = list(organism.quantity)) %>%
# rowwise() %>%
# mutate(infection_detected = any(!is.na(detected_organisms)))
#
# #merge all together
# data$sample_id = substring(data$Sample,
# 10,
# 20)
# md$Study.ID = as.character(md$Study.ID)
# #need for joining, fixed after
# edw_micro$order.datetime = as.character(edw_micro$order.datetime)
# md$BAL.Date = as.character(md$BAL.Date)
# data = data %>%
# left_join(., md, by = c("sample_id" = "Sample..", "ID" = "Study.ID")) %>%
# left_join(., simple_md, by = c("ID" = "Study.ID")) %>%
# left_join(., edw_endpoints, by = c("ID" = "pt_study_id")) %>%
# left_join(., edw_micro, by = c("ID" = "pt.study.id", "BAL.Date" = "order.datetime"))
# only_metadata = md %>%
# left_join(., simple_md, by = "Study.ID") %>%
# left_join(., edw_endpoints, by = c("Study.ID" = "pt_study_id")) %>%
# left_join(., edw_micro, by = c("Study.ID" = "pt.study.id", "BAL.Date" = "order.datetime"))
# only_metadata = unique(only_metadata)
# data$BAL.Date = as.Date(data$BAL.Date)
# edw_micro$order.datetime = as.Date(edw_micro$order.datetime)
#add EDW molecular data
edw_molecular = read.csv("~/Box/RGrant/SCRIPT/200608 SCRIPT and COVID BAL Results.csv",
na.strings = c("", "NA"),
strip.white = T,
check.names = T) %>%
select(-Name)
#make numeric values numeric
numeric_cols = c("day_of_intubation", "day_of_hospitalization", "RBC_BODY_FLUID", "WBC__BODY_FLUID", "NEUTROPHILS__BODY_FLUID",
"Absolute_Neutrophils", "TOXIC_GRANULATION", "MACROPHAGE_BF", "MONOCYTE_BF", "LYMPH_BF",
"ABSOLUTE_LYMPHOCYTES", "EOSINOPHILS__BODY_FLUID", "ABSOLUTE_EOSINOPHILS", "PLASMA_CELL_BF",
"OTHER_CELLS__BODY_FLUID", "AMYLASE_BF", "WHITE_BLOOD_CELL_COUNT",
"C_Reactive_Protein", "LDH", "CREATINE_KINASE__TOTAL", "PROCALCITONIN", "FERRITIN", "TROPONIN_I",
"Creatinine", "AST__SGOT_", "D_DIMER", "max_daily_temp")
edw_molecular = edw_molecular %>% mutate_at(.vars = numeric_cols, .funs = function(x){
x = gsub(">", "", x)
x = gsub("<", "", x)
x = gsub(",", "", x)
x = as.numeric(x)
return(x)})
#fix test columns
test_cols = colnames(edw_molecular)[c(which(colnames(edw_molecular) == "ASPERGILLUS_GALACTOMANNAN_ANTIGEN_NMH_LFH_"):which(colnames(edw_molecular) == "RESPIRATORY_SYNCYTIAL_VIRUS_RESPAN22"),
which(colnames(edw_molecular) == "STREPTOCOCCUS_PNEUMONIAE_ANTIGEN_URINE_1"),
which(colnames(edw_molecular) == "LEGIONELLA_ANTIGEN__EIA__URINE_1"))]
edw_molecular = edw_molecular %>% mutate_at(.vars = test_cols,
.funs = function(x){
x = factor(ifelse(is.na(x),
yes = NA,
no = ifelse((grepl("Not Detected", x, ignore.case = T) |
grepl("Negative", x, ignore.case = T)),
yes = "Negative",
no = "Positive")))
return(x) })
#remove one strange duplicate entry
edw_molecular = subset(edw_molecular, !(duplicated(paste(edw_molecular$ir_id, edw_molecular$BAL_order_timestamp))))
#format into long form
edw_molecular = edw_molecular %>%
pivot_longer(cols = contains("organism_"),
names_to = "microbiology_parameter",
values_to = "microbiology_value")
edw_molecular$main_microbiology_parameter = factor(gsub("_*\\d", "", edw_molecular$microbiology_parameter))
#flatten these parameters into lists of values
edw_molecular = edw_molecular %>%
group_by(ir_id, BAL_order_timestamp, main_microbiology_parameter) %>%
mutate(microbiology_value_list = list(microbiology_value)) %>% # list column
ungroup() %>%
rowwise() %>%
mutate_at(.vars = "microbiology_value_list", .funs = function(x){ #remove NA
cur = na.omit(x)
if(length(cur) == 0)
{
return(list(NULL))
} else
{
return(list(cur))
}
}) %>%
select(-c(microbiology_parameter, microbiology_value)) %>%
ungroup()
#pivot back into wide form for merging (list values get duplicated, need to fix)
listcols = as.character(unique(edw_molecular$main_microbiology_parameter))
edw_molecular = edw_molecular %>%
pivot_wider(names_from = main_microbiology_parameter,
values_from = microbiology_value_list) %>%
rowwise() %>%
#have to remove duplicated list vals
mutate_at(.vars = listcols, .funs = function(x){
return(list(x[[1]])) }) %>%
ungroup()
edw_molecular$order_accession_num = as.character(edw_molecular$order_accession_num)
#fix dates
date_cols = colnames(edw_molecular)[grepl("date", colnames(edw_molecular), ignore.case = T)]
edw_molecular = edw_molecular %>%
mutate_at(.vars = date_cols, .funs = function(x){
as.Date(x, format = "%m/%d/%y", tz = "CST")} )
#make organism quantity numeric
edw_molecular$organism_quantity = lapply(edw_molecular$organism_quantity,
function(x){
x = gsub(">", "", x)
x = gsub("<", "", x)
x = gsub(",", "", x)
x = as.numeric(x)
if(length(x) == 0)
{
return(NULL)
}
return(x)})
#import BAL data
edw_BAL = read.csv("~/Box/RGrant/SCRIPT/200608 SCRIPT BAL Related Labs.csv",
na.strings = c("", "NA"),
strip.white = T,
check.names = T) %>%
select(c(ir_id, pt_study_id, redcap_bal_dt)) %>% #these columns aren't helpful
unique()
edw_BAL$pt_study_id = as.character(edw_BAL$pt_study_id)
edw_BAL$redcap_bal_dt = as.Date(edw_BAL$redcap_bal_dt)
colnames(edw_BAL)[colnames(edw_BAL) == "bal_order_date"] = "BAL_order_date"
#import known COVID patients and all patient IDs
covid_cases = read.csv("~/Box/RGrant/SCRIPT/200608 SCRIPT_COVID_list.csv",
na.strings = c("", "NA"),
strip.white = T,
check.names = T,
colClasses = rep("character", 5))
covid_cases$covid_confirmed = T
all_patients = read.csv("~/Box/RGrant/SCRIPT/STU00204868_subjects_06_04_2020.csv",
na.strings = c("", "NA"),
strip.white = T,
check.names = T,
colClasses = rep("character", 10)) %>%
separate_rows(case.number, sep = ", ") #uncollapse ID column
patient_data = full_join(covid_cases,
all_patients,
by = c("clarity_west_mrn" = "nmhc_record_number")) %>%
select(-c(first_name, last_name, address.line.1:email, nmff_record_number, nmh_record_number,
first.name, last.name)) #some are duplicated
colnames(patient_data)[colnames(patient_data) == "case.number"] = "study_id"
#merge metadata
edw_molecular$ir_id = as.character(edw_molecular$ir_id)
edw_endpoints$ir_id = as.character(edw_endpoints$ir_id)
only_metadata = left_join(edw_molecular,
patient_data,
by = c("ir_id")) %>%
select(-MRN) %>%
full_join(.,
edw_endpoints,
by = c("ir_id", "study_id" = "pt_study_id")) %>%
select(-west_mrn)
#fix colnames
colnames(data) = gsub("\\.", "_", colnames(data)) #I like underscores
colnames(data) = gsub("_+$", "", colnames(data)) # remove trailing
colnames(data) = gsub("_+", "_", colnames(data)) #remove dup underscores
colnames(only_metadata) = gsub("\\.", "_", colnames(only_metadata))
colnames(only_metadata) = gsub("_+$", "", colnames(only_metadata))
colnames(only_metadata) = gsub("_+", "_", colnames(only_metadata))
#derive additional metadata
only_metadata$days_from_death = as.numeric(difftime(only_metadata$BAL_order_date, only_metadata$death_date, units = "days"))
only_metadata$covid_confirmed[is.na(only_metadata$covid_confirmed)] = FALSE #may be a safer way to do this
only_metadata$covid_confirmed = factor(only_metadata$covid_confirmed)
only_metadata$ventilator_days = as.numeric(difftime(only_metadata$BAL_order_date, only_metadata$first_intubation_date, units = "days"))
#merge with flow data
merge_date = rep(NA, nrow(data))
for(i in 1:nrow(data))
{
#get patient id and bal order date for each entry
cur = data[i, ]
patient = cur$ID
sample_process_dt = cur$sample_process_dt
latest = as.Date(sample_process_dt)
earliest = as.Date(sample_process_dt - 1) #24hr window
#perform matching (should just be one match per)
matches = subset(only_metadata,
study_id == patient & BAL_order_date >= earliest & BAL_order_date <= latest)
if(nrow(matches) == 0)
{
warning(paste("Unmatched sample warning. Patient:", patient, "Sort date:", sample_process_dt))
next
} else if(nrow(matches) > 1)
{
stop(paste("Error: mutliple matches for single sample. Patient:", patient, "Sort date:", sample_process_dt))
} else
{
merge_date[i] = as.character(matches$BAL_order_date)
}
}
data = cbind(data, merge_date)
data$merge_date = as.Date(data$merge_date)
data = left_join(data,
only_metadata,
by = c("ID" = "study_id", "merge_date" = "BAL_order_date"))
#cast into long form
data = unique(data)
mfi_cols = colnames(data)[grepl("MFI", colnames(data), ignore.case = T)]
percentage_cols = colnames(data)[grepl("percent", colnames(data), ignore.case = T)]
data_long = data %>%
pivot_longer(cols = c(mfi_cols, percentage_cols),
names_to = "flow_parameter",
values_to = "value") %>%
arrange(ID, ventilator_days) #for easy viewing later
data_long$flow_parameter = factor(data_long$flow_parameter)
#output for use with bulk
outname = paste0("~/Box/RGrant/SCRIPT/",
Sys.Date(),
"_",
"SCRIPT_clinical_metadata_processed.rds")
saveRDS(object = only_metadata,
file = outname)
outname = paste0("~/Box/RGrant/SCRIPT/",
Sys.Date(),
"_",
"SCRIPT_flow_plus_clinical_metadata_processed.rds")
saveRDS(object = data,
file = outname)
data
summary_data = only_metadata %>%
select(ir_id:OTHER_CELLS_BODY_FLUID,
AMYLASE_BF,
ASPERGILLUS_GALACTOMANNAN_ANTIGEN_NMH_LFH:D_DIMER,
max_daily_temp,
nmh_mrn:binned_outcome)
dfSummary(summary_data, plain.ascii = FALSE, style = "multiline",
graph.magnif = 0.75, valid.col = FALSE, tmp.img.dir = "/tmp", justify = "c")
percentage_data = subset(data_long, grepl("percent", data_long$flow_parameter))
shapiro.test(percentage_data$value)
##
## Shapiro-Wilk normality test
##
## data: percentage_data$value
## W = 0.74261, p-value < 2.2e-16
hist(percentage_data$value, breaks = 50)
mfi_data = subset(data_long, grepl("MFI", data_long$flow_parameter))
shapiro.test(mfi_data$value)
##
## Shapiro-Wilk normality test
##
## data: mfi_data$value
## W = 0.62702, p-value < 2.2e-16
hist(mfi_data$value, breaks = 50)
Extremely non-normal in both cases. At the very least, we need nonparametric tests. In reality, we probably need non-parametric tests or beta regression. However, this may not be true of individual parameters. First let’s see if transformations help.
serial_data = subset(percentage_data, ID %in% serial_patients &
!(flow_parameter %in% c("percent_CD4_total", "percent_CD206_total", "percent_total_CD206_low", "percent_total_CD206_high")))
ggplot(serial_data, aes(x = ventilator_days, y = value, fill = flow_parameter)) +
geom_bar(position = "stack", stat = "identity") +
facet_grid(ID ~ .) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
## Warning: Removed 448 rows containing missing values (position_stack).
# ggplot(serial_data, aes(x = days_from_death, y = value, fill = flow_parameter)) +
# geom_bar(position = "stack", stat = "identity") +
# facet_grid(ID ~ .) +
# theme(axis.text.x = element_text(angle = 45, hjust = 1))
# ggplot(serial_data, aes(x = days_from_extubation, y = value, fill = flow_parameter)) +
# geom_bar(position = "stack", stat = "identity") +
# facet_grid(ID ~ .) +
# theme(axis.text.x = element_text(angle = 45, hjust = 1))
As expected, there appear to be neutrophilic and non-neutrophilic trajectories captured here.
day0_percentages = subset(percentage_data, ventilator_days == 0 &
!(flow_parameter %in% c("percent_CD4_total", "percent_CD206_total", "percent_total_CD206_low", "percent_total_CD206_high")))
ggplot(day0_percentages, aes(x = ID, y = value, fill = flow_parameter)) +
geom_bar(position = "stack", stat = "identity") +
facet_grid(. ~ covid_confirmed, scales = "free_x") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
hist(asin(sqrt(percentage_data$value)), breaks = 50)
## Warning in asin(sqrt(percentage_data$value)): NaNs produced
shapiro.test(asin(sqrt(percentage_data$value)))
## Warning in asin(sqrt(percentage_data$value)): NaNs produced
##
## Shapiro-Wilk normality test
##
## data: asin(sqrt(percentage_data$value))
## W = 0.97364, p-value = 1.15e-05
Getting there!
hist(car::logit(percentage_data$value), breaks = 50)
## Warning in car::logit(percentage_data$value): proportions remapped to (0.025,
## 0.975)
shapiro.test(car::logit(percentage_data$value))
## Warning in car::logit(percentage_data$value): proportions remapped to (0.025,
## 0.975)
##
## Shapiro-Wilk normality test
##
## data: car::logit(percentage_data$value)
## W = 0.89537, p-value < 2.2e-16
This is getting very close. Let’s see if individual observations are normal.
percentage_data$logit = car::logit(percentage_data$value)
## Warning in car::logit(percentage_data$value): proportions remapped to (0.025,
## 0.975)
percentage_data %>%
group_by(flow_parameter) %>%
dplyr::summarize(pval = shapiro.test(logit)$p.value)
## `summarise()` ungrouping output (override with `.groups` argument)
ggplot(percentage_data, aes(x = logit)) +
geom_histogram() +
facet_wrap(~ flow_parameter)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
We can probably work with this, actually. Ideally we should talk to a statistician at some point before publication. Use normalized values for all stats.
hist(log10(mfi_data$value), breaks = 50)
## Warning in hist(log10(mfi_data$value), breaks = 50): NaNs produced
shapiro.test(log10(mfi_data$value))
## Warning in stopifnot(is.numeric(x)): NaNs produced
##
## Shapiro-Wilk normality test
##
## data: log10(mfi_data$value)
## W = 0.97248, p-value = 1.942e-13
Trimodal? But within individual parameters this should be okay. .
mfi_data$log10 = log10(mfi_data$value)
## Warning: NaNs produced
mfi_data %>%
group_by(flow_parameter) %>%
dplyr::summarize(pval = shapiro.test(log10)$p.value)
## `summarise()` ungrouping output (override with `.groups` argument)
ggplot(mfi_data, aes(x = log10)) +
geom_histogram() +
facet_wrap(~ flow_parameter)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing non-finite values (stat_bin).
Looks pretty good.
data_wide_percentage = percentage_data %>%
pivot_wider(names_from = flow_parameter,
values_from = c(logit, value))
data_wide_percentage$covid_confirmed[is.na(data_wide_percentage$covid_confirmed)] = FALSE
data_wide_percentage$covid_confirmed = factor(data_wide_percentage$covid_confirmed)
data_wide_percentage$response = data_wide_percentage %>%
select(logit_percent_CD4_total:logit_percent_others)
mancova_data = data_wide_percentage %>%
select(c(logit_percent_CD4_total:logit_percent_others,
covid_confirmed, ventilator_days)) %>%
na.omit()
mancova_fit = manova(cbind(logit_percent_CD4_total, logit_percent_CD4_Tregs, logit_percent_CD4_non_Tregs,
logit_percent_CD8_total, logit_percent_CD206_total, logit_percent_macs_CD206_high,
logit_percent_total_CD206_high, logit_percent_macs_CD206_low, logit_percent_total_CD206_low,
logit_percent_neutrophils, logit_percent_monocytes, logit_percent_others)
~ covid_confirmed * Outcomes,
na.action = "na.omit",
data = mancova_data) #add outcomes??
summary.aov(mancova_fit)
These are interesting results! With the exception of monocytes, abundance of all cells is affected by COVID-19 in some way. For CD4T, and CD8T, this appears to be purely a main effect. For Tregs and putative MoAM, there is also an interaction effect of day and infection. For TRAM, there is only an interaction effect.
Area plot
area_data = percentage_data %>%
group_by(covid_confirmed, ventilator_days, flow_parameter) %>%
dplyr::summarize(mean_pct = mean(value))
## `summarise()` regrouping output by 'covid_confirmed', 'ventilator_days' (override with `.groups` argument)
area_data = subset(area_data, !(flow_parameter %in% c("percent_CD4_total", "percent_CD206_total", "percent_total_CD206_low", "percent_total_CD206_high")))
ggplot(area_data, aes(x = ventilator_days, y = mean_pct, fill = flow_parameter)) +
geom_area(alpha = 0.7) +
facet_grid(covid_confirmed ~ .)
## Warning: Removed 16 rows containing missing values (position_stack).
This give a false impression between samples. A line plot may actually be better.
Grouped
ggplot(percentage_data, aes(x = ventilator_days, y = value, color = flow_parameter)) +
stat_summary(fun = mean, geom = "line") +
stat_summary(fun.data = mean_se, geom = "errorbar", width = 0.5, alpha = 0.5) +
facet_grid(binned_outcome ~ covid_confirmed)
By patient
ggplot(percentage_data, aes(x = ventilator_days, y = value, color = flow_parameter)) +
geom_line() +
facet_wrap(~ ID)
Not enough data at this point to glean much of anything.
#cast into matrix of patient x flow parameter
all_day0 = data %>%
dplyr::filter(ventilator_days == 0 & (is.na(ventilator_days) | ventilator_days == 0)) %>%
select(ID, percent_CD4_total:percent_others, binned_outcome, covid_confirmed)
annotation_data = all_day0 %>%
select(ID, binned_outcome, covid_confirmed) %>%
remove_rownames() %>%
column_to_rownames(var = "ID")
matrix_data = all_day0 %>%
select(ID, percent_CD4_total:percent_others) %>%
remove_rownames() %>%
column_to_rownames(var = "ID") %>%
as.matrix()
pheatmap(mat = matrix_data,
cluster_rows = T,
cluster_cols = T,
clustering_method = "ward.D2",
annotation_row = annotation_data,
scale = "column",
angle_col = 45,
breaks = seq(-3, 3, length.out=101),
color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(101))
all_days = data %>%
dplyr::filter(!is.na(ventilator_days)) %>%
select(Sample, percent_CD4_total:percent_others, binned_outcome, covid_confirmed, ID, ventilator_days) %>%
arrange(ID, ventilator_days)
annotation_data = all_days %>%
select(Sample, binned_outcome, covid_confirmed, ID, ventilator_days) %>%
remove_rownames() %>%
column_to_rownames(var = "Sample")
matrix_data = all_days %>%
select(Sample, percent_CD4_total:percent_others) %>%
remove_rownames() %>%
column_to_rownames(var = "Sample") %>%
as.matrix()
pheatmap(mat = matrix_data,
cluster_rows = T,
cluster_cols = T,
clustering_method = "ward.D2",
annotation_row = annotation_data,
scale = "column",
angle_col = 45,
show_rownames = F,
breaks = seq(-3, 3, length.out=101),
color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(101))
pheatmap(mat = matrix_data,
cluster_rows = F,
cluster_cols = T,
clustering_method = "ward.D2",
annotation_row = annotation_data,
scale = "column",
angle_col = 45,
show_rownames = F,
breaks = seq(-3, 3, length.out=101),
color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(101))
all_days = data %>%
dplyr::filter(!is.na(ventilator_days) & ID %in% serial_patients) %>%
rowwise() %>%
mutate(infection_detected = factor(!is.null(organism_quantity))) %>%
ungroup() %>%
select(Sample, percent_CD4_total:percent_others, binned_outcome,
covid_confirmed, ID, ventilator_days,
C_Reactive_Protein, infection_detected) %>%
arrange(ID, ventilator_days)
annotation_data = all_days %>%
select(Sample, binned_outcome, covid_confirmed,
C_Reactive_Protein, infection_detected, ventilator_days) %>%
remove_rownames() %>%
column_to_rownames(var = "Sample")
matrix_data = all_days %>%
select(Sample, percent_CD4_total:percent_others) %>%
remove_rownames() %>%
column_to_rownames(var = "Sample") %>%
as.matrix()
pheatmap(mat = matrix_data,
cluster_rows = T,
cluster_cols = T,
clustering_method = "ward.D2",
annotation_row = annotation_data,
scale = "column",
angle_col = 45,
show_rownames = T,
breaks = seq(-3, 3, length.out=101),
color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(101))
breaks = which(!duplicated(all_days$ID)) - 1
pheatmap(mat = matrix_data,
cluster_rows = F,
cluster_cols = T,
clustering_method = "ward.D2",
annotation_row = annotation_data,
scale = "column",
angle_col = 45,
show_rownames = T,
gaps_row = breaks,
breaks = seq(-3, 3, length.out=101),
color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(101))
only_metadata = only_metadata %>%
rowwise() %>%
mutate(infection_detected = !is.null(organism_name)) %>%
ungroup()
only_metadata$infection_detected[is.na(only_metadata$infection_detected)] = "Not Tested"
only_metadata$infection_detected = factor(only_metadata$infection_detected,
levels = c("Not Tested", "TRUE", "FALSE"))
ggplot(subset(only_metadata, !is.na(covid_confirmed)), aes(x = covid_confirmed, fill = infection_detected)) +
geom_bar(position = "fill") +
ylab("Proportion") +
scale_fill_manual(values = c("Not Tested" = "gray", "FALSE" = "cornflowerblue", "TRUE" = "firebrick"))
covid_organism_levels = only_metadata %>%
dplyr::filter(covid_confirmed == T) %>%
select(organism_quantity) %>%
unlist()
other_organism_levels = only_metadata %>%
dplyr::filter(covid_confirmed == F) %>%
select(organism_quantity) %>%
unlist()
ggplot(NULL) +
geom_density(aes(x = covid_organism_levels), fill = "firebrick", alpha = 0.5) +
geom_density(aes(x = other_organism_levels), fill = "cornflowerblue", alpha = 0.5)
covid_organisms = as.character(only_metadata %>%
dplyr::filter(covid_confirmed == T) %>%
select(organism_name) %>%
unlist() %>%
na.omit())
covid_organisms = gsub("[[:punct:]] | | [[:punct:]]|[[:punct:]]", "_", trimws(covid_organisms)) #so we can treat as one word
covid_organisms = gsub("_$", "", covid_organisms)
other_organisms = as.character(only_metadata %>%
dplyr::filter(covid_confirmed == F) %>%
select(organism_name) %>%
unlist() %>%
na.omit())
other_organisms = gsub("[[:punct:]] | | [[:punct:]]|[[:punct:]]", "_", trimws(other_organisms))
other_organisms = gsub("_$", "", other_organisms)
#make into data frame with normalized freqs
df1 = data.frame(word = names(termFreq(covid_organisms)),
freq = as.numeric(termFreq(covid_organisms)) /
sum(as.numeric(termFreq(covid_organisms))),
covid_confirmed = T)
df2 = data.frame(word = names(termFreq(other_organisms)),
freq = as.numeric(termFreq(other_organisms)) /
sum(as.numeric(termFreq(other_organisms))),
covid_confirmed = F)
#finally, plot
wordcould_df = rbind(df1, df2)
ggplot(wordcould_df, aes(label = word, size = freq)) +
geom_text_wordcloud(show.legend = F) +
facet_wrap(~ covid_confirmed)
## Warning in wordcloud_boxes(data_points = points_valid_first, boxes = boxes, :
## Some words could not fit on page. They have been placed at their original
## positions.
## Warning in wordcloud_boxes(data_points = points_valid_first, boxes = boxes, :
## Some words could not fit on page. They have been placed at their original
## positions.
Viridans Streptococcus
metagenomics_data_a = only_metadata %>%
unique() %>%
unnest_longer(col = organism_name, simplify = T) %>%
select(-c(Organism_ID, organism_quantity))
metagenomics_data_b = only_metadata %>%
unique() %>%
unnest_longer(col = organism_quantity, simplify = T) %>%
select(-c(Organism_ID))
metagenomics_data_b$organism_quantity[is.na(metagenomics_data_b$organism_quantity)] = 0
metagenomics_data = cbind(metagenomics_data_a, organism_quantity = metagenomics_data_b$organism_quantity) %>%
pivot_wider(names_from = organism_name,
values_from = organism_quantity)
rm(metagenomics_data_a, metagenomics_data_b)
viridans_data = metagenomics_data %>%
select(ir_id, BAL_order_timestamp, covid_confirmed,
`Viridans streptococcus`, `Viridans streptococcus #2`,
`Viridans streptococcus #3`) %>%
pivot_longer(cols = c(`Viridans streptococcus`, `Viridans streptococcus #2`, `Viridans streptococcus #3`),
names_to = "strain", values_to = "organism_quantity")
viridans_data$organism_quantity[is.na(viridans_data$organism_quantity)] = 0
ggplot(viridans_data, aes(x = covid_confirmed, y = organism_quantity, fill = strain)) +
geom_boxplot()
clinical_data = only_metadata %>%
unique() %>% #oddly there is duplication
dplyr::filter(!is.na(BAL_order_date)) %>% #remove samples without BAL info
mutate(sample_id = paste(ir_id, BAL_order_timestamp, sep = "_")) %>%
select(c(sample_id,
RBC_BODY_FLUID:Absolute_Neutrophils,
MACROPHAGE_BF:OTHER_CELLS_BODY_FLUID,
AMYLASE_BF,
WHITE_BLOOD_CELL_COUNT:D_DIMER,
max_daily_temp)) %>%
column_to_rownames(var = "sample_id")
safe_cols = function(x, proportion_present = 0.5)
{
return(all(is.finite(na.omit(x))) &
sd(x, na.rm = T) > 0 &
(sum(!is.na(x)) / (length(x)) > proportion_present))
}
keep_cols = apply(clinical_data, 2, safe_cols)
pca_data = na.omit(clinical_data[, keep_cols])
#need to use formula interface to deal with NAs because nothing is ever easy
formula = paste0("~ ",
paste(colnames(pca_data), collapse = "+"))
formula = as.formula(formula)
pca = prcomp(formula = formula,
data = pca_data,
scale. = T,
na.action = na.omit)
pca_data_complete = only_metadata %>%
unique() %>%
mutate(sample_id = paste(ir_id, BAL_order_timestamp, sep = "_")) %>%
dplyr::filter(sample_id %in% rownames(pca_data)) %>%
column_to_rownames(var = "sample_id")
fviz_eig(pca)
autoplot(pca,
data = pca_data_complete,
colour = "binned_outcome",
shape = "covid_confirmed",
loadings = TRUE,
loadings.colour = alpha("blue", 0.1),
loadings.label = TRUE,
loadings.label.size = 2,
loadings.label.colour = alpha("red", 0.7),
x = 1,
y = 2)
## Warning: `select_()` is deprecated as of dplyr 0.7.0.
## Please use `select()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
autoplot(pca,
data = pca_data_complete,
colour = "binned_outcome",
shape = "covid_confirmed",
loadings = TRUE,
loadings.colour = alpha("blue", 0.1),
loadings.label = TRUE,
loadings.label.size = 2,
loadings.label.colour = alpha("red", 0.7),
x = 2,
y = 3)
autoplot(pca,
data = pca_data_complete,
colour = "binned_outcome",
shape = "covid_confirmed",
loadings = TRUE,
loadings.colour = alpha("blue", 0.1),
loadings.label = TRUE,
loadings.label.size = 2,
loadings.label.colour = alpha("red", 0.7),
x = 3,
y = 4)
Some interesting loadings: PC1-2 is driven heavily by markers of sepsis and/or organ failure. PC3 appears to just broadly be immune response, but I find it interesting that CRP and monocytes (in this case from BAL) travel together. Are they the source of IL6? Is this a response to IL6? PC4 is similar, with influence of CRP and ferritin suggesting high levels of inflammation. Let’s also try UMAP to see if there is some hidden structure.
clinical_data_ltd = only_metadata %>%
unique() %>% #oddly there is duplication
dplyr::filter(!is.na(BAL_order_date)) %>% #remove samples without BAL info
mutate(sample_id = paste(ir_id, BAL_order_timestamp, sep = "_")) %>%
select(c(sample_id,
Absolute_Neutrophils,
WHITE_BLOOD_CELL_COUNT:D_DIMER,
max_daily_temp)) %>%
column_to_rownames(var = "sample_id")
keep_cols = apply(clinical_data_ltd, 2, safe_cols)
pca_data = na.omit(clinical_data_ltd[, keep_cols])
#need to use formula interface to deal with NAs because nothing is ever easy
formula = paste0("~ ",
paste(colnames(pca_data), collapse = "+"))
formula = as.formula(formula)
pca = prcomp(formula = formula,
data = pca_data,
scale. = T,
na.action = na.omit)
pca_data_complete = only_metadata %>%
unique() %>%
mutate(sample_id = paste(ir_id, BAL_order_timestamp, sep = "_")) %>%
dplyr::filter(sample_id %in% rownames(pca_data)) %>%
column_to_rownames(var = "sample_id")
autoplot(pca,
data = pca_data_complete,
colour = "binned_outcome",
loadings = TRUE,
loadings.colour = alpha("blue", 0.1),
loadings.label = TRUE,
loadings.label.size = 2,
loadings.label.colour = alpha("red", 0.7),
x = 1,
y = 2)
autoplot(pca,
data = pca_data_complete,
colour = "binned_outcome",
loadings = TRUE,
loadings.colour = alpha("blue", 0.1),
loadings.label = TRUE,
loadings.label.size = 2,
loadings.label.colour = alpha("red", 0.7),
x = 2,
y = 3)
autoplot(pca,
data = pca_data_complete,
colour = "binned_outcome",
loadings = TRUE,
loadings.colour = alpha("blue", 0.1),
loadings.label = TRUE,
loadings.label.size = 2,
loadings.label.colour = alpha("red", 0.7),
x = 3,
y = 4)
umap = umap(pca_data,
n_neighbors = 15,
min_dist = 0.1,
scale = "Z")
colnames(umap) = c("umap_1", "umap_2")
umap_data = cbind(pca_data_complete, umap)
ggplot(umap_data, aes(x = umap_1, y = umap_2, color = binned_outcome)) +
geom_point() +
stat_ellipse()
## Too few points to calculate an ellipse
## Warning: Removed 1 row(s) containing missing values (geom_path).
This doesn’t seem to add much.
clinical_day0 = only_metadata %>%
unique() %>% #oddly there is duplication
dplyr::filter(!is.na(BAL_order_date) & ventilator_days == 0) %>% #remove samples without BAL info
mutate(sample_id = paste(ir_id, BAL_order_timestamp, sep = "_")) %>%
select(c(sample_id,
Absolute_Neutrophils,
WHITE_BLOOD_CELL_COUNT:D_DIMER,
max_daily_temp)) %>%
column_to_rownames(var = "sample_id") %>%
na.omit()
keep_cols = apply(clinical_day0, 2, safe_cols)
pca_data = clinical_day0[, keep_cols]
pca_data_complete = only_metadata %>%
unique() %>%
mutate(sample_id = paste(ir_id, BAL_order_timestamp, sep = "_")) %>%
dplyr::filter(sample_id %in% rownames(pca_data)) %>%
column_to_rownames(var = "sample_id")
#need to use formula interface to deal with NAs because nothing is ever easy
formula = paste0("~ ",
paste(colnames(pca_data), collapse = "+"))
formula = as.formula(formula)
pca = prcomp(formula = formula,
data = pca_data,
scale. = T,
na.action = na.omit)
autoplot(pca,
data = pca_data_complete,
colour = "binned_outcome",
loadings = TRUE,
loadings.colour = alpha("blue", 0.1),
loadings.label = TRUE,
loadings.label.size = 2,
loadings.label.colour = alpha("red", 0.7),
x = 1,
y = 2)
autoplot(pca,
data = pca_data_complete,
colour = "binned_outcome",
loadings = TRUE,
loadings.colour = alpha("blue", 0.1),
loadings.label = TRUE,
loadings.label.size = 2,
loadings.label.colour = alpha("red", 0.7),
x = 2,
y = 3)
autoplot(pca,
data = pca_data_complete,
colour = "binned_outcome",
loadings = TRUE,
loadings.colour = alpha("blue", 0.1),
loadings.label = TRUE,
loadings.label.size = 2,
loadings.label.colour = alpha("red", 0.7),
x = 3,
y = 4)
Not actually very different from complete dataset.
flow_pca = data %>%
unique() %>% #oddly there is duplication
dplyr::filter(!is.na(merge_date)) %>% #remove samples without BAL info
mutate(sample_id = paste(ir_id, BAL_order_timestamp, sep = "_")) %>%
select(c(Sample,
Absolute_Neutrophils,
WHITE_BLOOD_CELL_COUNT:D_DIMER,
max_daily_temp,
percent_CD4_total:percent_others)) %>%
column_to_rownames(var = "Sample")
keep_cols = apply(flow_pca, 2, function(x){ safe_cols(x = x, proportion_present = 0.9) })
pca_data = na.omit(flow_pca[, keep_cols])
#need to use formula interface to deal with NAs because nothing is ever easy
formula = paste0("~ ",
paste(colnames(pca_data), collapse = "+"))
formula = as.formula(formula)
pca = prcomp(formula = formula,
data = pca_data,
scale. = T,
na.action = na.omit)
pca_data_complete = data %>%
unique() %>%
dplyr::filter(Sample %in% rownames(pca_data)) %>%
column_to_rownames(var = "Sample")
autoplot(pca,
data = pca_data_complete,
colour = "binned_outcome",
loadings = TRUE,
loadings.colour = alpha("blue", 0.1),
loadings.label = TRUE,
loadings.label.size = 2,
loadings.label.colour = alpha("red", 0.7),
x = 1,
y = 2)
autoplot(pca,
data = pca_data_complete,
colour = "binned_outcome",
loadings = TRUE,
loadings.colour = alpha("blue", 0.1),
loadings.label = TRUE,
loadings.label.size = 2,
loadings.label.colour = alpha("red", 0.7),
x = 2,
y = 3)
autoplot(pca,
data = pca_data_complete,
colour = "binned_outcome",
loadings = TRUE,
loadings.colour = alpha("blue", 0.1),
loadings.label = TRUE,
loadings.label.size = 2,
loadings.label.colour = alpha("red", 0.7),
x = 3,
y = 4)
autoplot(pca,
data = pca_data_complete,
colour = "binned_outcome",
loadings = TRUE,
loadings.colour = alpha("blue", 0.1),
loadings.label = TRUE,
loadings.label.size = 2,
loadings.label.colour = alpha("red", 0.7),
x = 4,
y = 5)
autoplot(pca,
data = pca_data_complete,
colour = "binned_outcome",
loadings = TRUE,
loadings.colour = alpha("blue", 0.1),
loadings.label = TRUE,
loadings.label.size = 2,
loadings.label.colour = alpha("red", 0.7),
x = 5,
y = 6)
heatmap_data = only_metadata %>%
unique() %>% #oddly there is duplication
dplyr::filter(!is.na(BAL_order_date)) %>% #remove samples without BAL info
mutate(sample_id = paste(ir_id, BAL_order_timestamp, sep = "_")) %>%
select(c(sample_id,
Absolute_Neutrophils,
WHITE_BLOOD_CELL_COUNT:D_DIMER,
max_daily_temp)) %>%
column_to_rownames(var = "sample_id")
heatmap_metadata = only_metadata %>%
unique() %>%
mutate(sample_id = paste(ir_id, BAL_order_timestamp, sep = "_")) %>%
dplyr::filter(sample_id %in% rownames(heatmap_data)) %>%
select(sample_id, ventilator_days, binned_outcome) %>%
column_to_rownames(var = "sample_id")
pheatmap(heatmap_data,
cluster_rows = T,
cluster_cols = T,
clustering_method = "ward.D2",
annotation_row = heatmap_metadata,
scale = "column",
angle_col = 45,
breaks = seq(-5, 5, length.out=101),
color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(101))